import os import pathlib import sys import joblib from sklearn.utils import Bunch from sklearn.model_selection import train_test_split from functools import partial from .. import paths from ..log import logger from .fetch import fetch_file, get_dataset_filename, hash_file, unpack, infer_filename from .utils import partial_call_signature, serialize_partial, deserialize_partial, process_dataset_default from ..utils import load_json, save_json __all__ = [ 'Dataset', 'DataSource', 'add_datasource', 'del_datasource', 'available_datasets', 'available_datasources', 'process_datasources', ] _MODULE = sys.modules[__name__] _MODULE_DIR = pathlib.Path(os.path.dirname(os.path.abspath(__file__))) def available_datasets(dataset_path=None, keys_only=True): """Get a list of available datasets. Parameters ---------- dataset_path: path location of saved dataset files """ if dataset_path is None: dataset_path = paths['processed_data_path'] else: dataset_path = pathlib.Path(dataset_path) ds_dict = {} for dsfile in dataset_path.glob("*.metadata"): ds_stem = str(dsfile.stem) ds_meta = Dataset.load(ds_stem, data_path=dataset_path, metadata_only=True) ds_dict[ds_stem] = ds_meta if keys_only: return list(ds_dict.keys()) return ds_dict def process_datasources(datasources=None, action='process'): """Fetch, Unpack, and Process data sources. Parameters ---------- datasources: list or None List of data source names to process. if None, loops over all available data sources. action: {'fetch', 'unpack', 'process'} Action to perform on data sources: 'fetch': download raw files 'unpack': unpack raw files 'process': generate and cache Dataset objects """ if datasources is None: datasources = available_datasources() for dataset_name in datasources: dsrc = DataSource.from_name(dataset_name) logger.info(f'Running {action} on {dataset_name}') if action == 'fetch': dsrc.fetch() elif action == 'unpack': dsrc.unpack() elif action == 'process': ds = dsrc.process() logger.info(f'{dataset_name}: processed data has shape:{ds.data.shape}') def add_datasource(rawds): """Add a data source to the list of available data sources""" rawds_list, rds_file_fq = available_datasources(keys_only=False) rawds_list[rawds.name] = rawds.to_dict() save_json(rds_file_fq, rawds_list) def del_datasource(key): """Delete an entry in the datasource dict key: name of data source to delete """ datasource_list, datasource_file_fq = available_datasources(keys_only=False) del(datasource_list[key]) save_json(datasource_file_fq, datasource_list) def available_datasources(datasource_file='datasources.json', datasource_path=None, keys_only=True): """Returns the list of available datasets. Instructions for creating DataSources is stored in `datasources.json` by default. keys_only: boolean if True, return a list of available datasets (default) if False, return complete dataset dictionary and filename Returns ------- If `keys_only` is True: List of available dataset names else: Tuple (available_datasource_dict, available_datasource_dict_filename) """ if datasource_path is None: datasource_path = paths['catalog_path'] datasource_file_fq = pathlib.Path(datasource_path) / datasource_file if not datasource_file_fq.exists(): datasource_dict = {} logger.warning(f"No dataset file found: {datasource_file}") else: datasource_dict = load_json(datasource_file_fq) if keys_only: return list(datasource_dict.keys()) return datasource_dict, datasource_file_fq class Dataset(Bunch): def __init__(self, dataset_name=None, data=None, target=None, metadata=None, update_hashes=True, **kwargs): """ Object representing a dataset object. Notionally compatible with scikit-learn's Bunch object dataset_name: string (required) key to use for this dataset data: Data: (usually np.array or np.ndarray) target: np.array Either classification target or label to be used. for each of the points in `data` metadata: dict Data about the object. Key fields include `license_txt` and `descr` update_hashes: If True, update the data/target hashes in the Metadata. """ super().__init__(**kwargs) if dataset_name is None: if metadata is not None and metadata.get("dataset_name", None) is not None: dataset_name = metadata['dataset_name'] else: raise Exception('dataset_name is required') if metadata is not None: self['metadata'] = metadata else: self['metadata'] = {} self['metadata']['dataset_name'] = dataset_name self['data'] = data self['target'] = target if update_hashes: data_hashes = self.get_data_hashes() self['metadata'] = {**self['metadata'], **data_hashes} def __getattribute__(self, key): if key.isupper(): try: return self['metadata'][key.lower()] except: raise AttributeError(key) else: return super().__getattribute__(key) def __setattr__(self, key, value): if key.isupper(): self['metadata'][key.lower()] = value elif key == 'name': self['metadata']['dataset_name'] = value else: super().__setattr__(key, value) def __str__(self): s = f"<Dataset: {self.name}" if self.get('data', None) is not None: shape = getattr(self.data, 'shape', 'Unknown') s += f", data.shape={shape}" if self.get('target', None) is not None: shape = getattr(self.target, 'shape', 'Unknown') s += f", target.shape={shape}" meta = self.get('metadata', {}) if meta: s += f", metadata={list(meta.keys())}" s += ">" return s @property def name(self): return self['metadata'].get('dataset_name', None) @name.setter def name(self, val): self['metadata']['dataset_name'] = val @property def has_target(self): return self['target'] is not None @classmethod def load(cls, file_base, data_path=None, metadata_only=False): """Load a dataset must be present in dataset.json""" if data_path is None: data_path = paths['processed_data_path'] else: data_path = pathlib.Path(data_path) if metadata_only: metadata_fq = data_path / f'{file_base}.metadata' with open(metadata_fq, 'rb') as fd: meta = joblib.load(fd) return meta with open(data_path / f'{file_base}.dataset', 'rb') as fd: ds = joblib.load(fd) return ds @classmethod def from_datasource(cls, dataset_name, cache_path=None, fetch_path=None, force=False, unpack_path=None, **kwargs): '''Creates Dataset object from a named DataSource. Dataset will be cached after creation. Subsequent calls with matching call signature will return this cached object. Parameters ---------- dataset_name: Name of dataset to load. see `available_datasources()` for the current list cache_path: path Directory to search for Dataset cache files fetch_path: path Directory to download raw files into force: boolean If True, always regenerate the dataset. If false, a cached result can be returned unpack_path: path Directory to unpack raw files into **kwargs: Remaining keywords arguments are passed directly to DataSource.process(). See that docstring for details. Remaining keywords arguments are passed to the DataSource's `process()` method ''' dataset_list, _ = available_datasources(keys_only=False) if dataset_name not in dataset_list: raise Exception(f'Unknown Dataset: {dataset_name}') dsrc = DataSource.from_dict(dataset_list[dataset_name]) dsrc.fetch(fetch_path=fetch_path, force=force) dsrc.unpack(unpack_path=unpack_path, force=force) ds = dsrc.process(cache_path=cache_path, force=force, **kwargs) return ds def get_data_hashes(self, exclude_list=None, hash_type='sha1'): """Compute a the hash of data items exclude_list: list or None List of attributes to skip. if None, skips ['metadata'] hash_type: {'sha1', 'md5', 'sha256'} Algorithm to use for hashing """ if exclude_list is None: exclude_list = ['metadata'] ret = {'hash_type': hash_type} for key, value in self.items(): if key in exclude_list: continue ret[f"{key}_hash"] = joblib.hash(value, hash_name=hash_type) return ret def dump(self, file_base=None, dump_path=None, hash_type='sha1', force=True, create_dirs=True, dump_metadata=True): """Dump a dataset. Note, this dumps a separate copy of the metadata structure, so that metadata can be looked up without loading the entire dataset, which could be large dump_metadata: boolean If True, also dump a standalone copy of the metadata. Useful for checking metadata without reading in the (potentially large) dataset itself file_base: string Filename stem. By default, just the dataset name hash_type: {'sha1', 'md5'} Hash function to use for hashing data/labels dump_path: path. (default: `paths['processed_data_path']`) Directory where data will be dumped. force: boolean If False, raise an exception if the file already exists If True, overwrite any existing files create_dirs: boolean If True, `dump_path` will be created (if necessary) """ if dump_path is None: dump_path = paths['processed_data_path'] dump_path = pathlib.Path(dump_path) if file_base is None: file_base = self.name metadata = self['metadata'] metadata_filename = file_base + '.metadata' dataset_filename = file_base + '.dataset' metadata_fq = dump_path / metadata_filename data_hashes = self.get_data_hashes(hash_type=hash_type) self['metadata'] = {**self['metadata'], **data_hashes} # check for a cached version if metadata_fq.exists() and force is not True: logger.warning(f"Existing metatdata file found: {metadata_fq}") cached_metadata = joblib.load(metadata_fq) # are we a subset of the cached metadata? (Py3+ only) if metadata.items() <= cached_metadata.items(): raise Exception(f'Dataset with matching metadata exists already. ' 'Use `force=True` to overwrite, or change one of ' '`dataset.metadata` or `file_base`') else: raise Exception(f'Metadata file {metadata_filename} exists ' 'but metadata has changed. ' 'Use `force=True` to overwrite, or change ' '`file_base`') if create_dirs: os.makedirs(metadata_fq.parent, exist_ok=True) if dump_metadata: with open(metadata_fq, 'wb') as fo: joblib.dump(metadata, fo) logger.debug(f'Wrote Dataset Metadata: {metadata_filename}') dataset_fq = dump_path / dataset_filename with open(dataset_fq, 'wb') as fo: joblib.dump(self, fo) logger.debug(f'Wrote Dataset: {dataset_filename}') class DataSource(object): """Representation of a data source""" def __init__(self, name='datasource', parse_function=None, dataset_dir=None, file_list=None): """Create a DataSource Parameters ---------- name: str name of dataset parse_function: func (or partial) Function that will be called to process raw data into usable Dataset dataset_dir: path default location for raw files file_list: list list of file_dicts associated with this DataSource. Valid keys for each file_dict include: url: (optional) URL of resource to be fetched hash_type: {'sha1', 'md5', 'sha256'} Type of hash function used to verify file integrity hash_value: string Value of hash used to verify file integrity file_name: string (optional) filename to use when saving file locally. If omitted, it will be inferred from url or source_file name: string or {'DESCR', 'LICENSE'} (optional) description of the file. of DESCR or LICENSE, will be used as metadata unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. """ if file_list is None: file_list = [] if dataset_dir is None: dataset_dir = paths['raw_data_path'] if parse_function is None: parse_function = process_dataset_default self.name = name self.file_dict = {infer_filename(**item):item for item in file_list} self.parse_function = parse_function self.dataset_dir = dataset_dir # sklearn-style attributes. Usually these would be set in fit() self.fetched_ = False self.fetched_files_ = [] self.unpacked_ = False self.unpack_path_ = None @property def file_list(self): """For backwards compatibility while replacing the file_list with a file_dict""" logger.warning("file_list is deprecated. Use file_dict instead") return list(self.file_dict.values()) def add_metadata(self, filename=None, contents=None, metadata_path=None, kind='DESCR', unpack_action='copy', force=False): """Add metadata to a DataSource filename: create metadata entry from contents of this file contents: create metadata entry from this string metadata_path: (default `paths['raw_data_path']`) Where to store metadata kind: {'DESCR', 'LICENSE'} unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. force: boolean (default False) If True, overwrite an existing entry for this file """ if metadata_path is None: metadata_path = paths['raw_data_path'] else: metadata_path = pathlib.Path(metadata_path) filename_map = { 'DESCR': f'{self.name}.readme', 'LICENSE': f'{self.name}.license', } if kind not in filename_map: raise Exception(f'Unknown kind: {kind}. Must be one of {filename_map.keys()}') if filename is not None: filelist_entry = { 'fetch_action': 'copy', 'file_name': str(filename), 'name': kind, } elif contents is not None: filelist_entry = { 'contents': contents, 'fetch_action': 'create', 'file_name': filename_map[kind], 'name': kind, } else: raise Exception(f'One of `filename` or `contents` is required') if unpack_action: filelist_entry.update({'unpack_action': unpack_action}) fn = filelist_entry['file_name'] if fn in self.file_dict and not force: raise Exception(f"{fn} already exists in file_dict. Set `force=True` to overwrite.") self.file_dict[fn] = filelist_entry self.fetched_ = False def add_manual_download(self, message=None, *, hash_type='sha1', hash_value=None, name=None, file_name=None, unpack_action=None, force=False): """Add a manual download step to the file list. Some datasets must be downloaded manually (usually ones that require opting-in to a specific set of terms and conditions). This method displays a message indicating how the user can manually download the file, and from where. message: string Message to be displayed to the user. This message indicates how to download the indicated dataset. hash_type: {'sha1', 'md5', 'sha256'} hash_value: string. required Hash, computed via the algorithm specified in `hash_type` file_name: string, required Name of destination file. relative to paths['raw_data_dir'] name: str text description of this file. force: boolean (default False) If True, overwrite an existing entry for this file unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. """ if hash_value is None: raise Exception("You must specify a `hash_value` " "for a manual download") if file_name is None: raise Exception("You must specify a file_name for a manual download") if file_name in self.file_dict and not force: raise Exception(f"{file_name} already in file_dict. Use `force=True` to overwrite") fetch_dict = { 'fetch_action': 'message', 'file_name': file_name, 'hash_type': hash_type, 'hash_value': hash_value, 'message': message, 'name': name, } if unpack_action: fetch_dict.update({'unpack_action': unpack_action}) self.file_dict[file_name] = fetch_dict self.fetched_ = False def add_file(self, source_file=None, *, hash_type='sha1', hash_value=None, name=None, file_name=None, unpack_action=None, force=False): """ Add a file to the file list. This file must exist on disk, as there is no method specified for fetching it. This is useful when the data source requires an offline procedure for downloading. hash_type: {'sha1', 'md5', 'sha256'} hash_value: string or None if None, hash will be computed from specified file file_name: string Name of destination file. relative to paths['raw_data_dir'] name: str text description of this file. source_file: path file to be copied force: boolean (default False) If True, overwrite an existing entry for this file unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. """ if source_file is None: raise Exception("`source_file` is required") source_file = pathlib.Path(source_file) if not source_file.exists(): logger.warning(f"{source_file} not found on disk") file_name = infer_filename(file_name=file_name, source_file=source_file) if hash_value is None: logger.debug(f"Hash unspecified. Computing {hash_type} hash of {source_file.name}") hash_value = hash_file(source_file, algorithm=hash_type).hexdigest() fetch_dict = { 'fetch_action': 'copy', 'file_name': file_name, 'hash_type': hash_type, 'hash_value': hash_value, 'name': name, 'source_file': str(source_file), } if unpack_action: fetch_dict.update({'unpack_action': unpack_action}) existing_files = [f['source_file'] for k,f in self.file_dict.items()] existing_hashes = [f['hash_value'] for k,f in self.file_dict.items() if f['hash_value']] if file_name in self.file_dict and not force: raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.") if str(source_file.name) in existing_files and not force: raise Exception(f"source file: {source_file} already in file list. Use `force=True` to add anyway.") if hash_value in existing_hashes and not force: raise Exception(f"file with hash {hash_value} already in file list. Use `force=True` to add anyway.") logger.warning("Reproducibility Issue: add_file is often not reproducible. If possible, use add_manual_download instead") self.file_dict[file_name] = fetch_dict self.fetched_ = False def add_url(self, url=None, *, hash_type='sha1', hash_value=None, name=None, file_name=None, force=False, unpack_action=None): """ Add a URL to the file list hash_type: {'sha1', 'md5', 'sha256'} hash_value: string or None if None, hash will be computed from downloaded file file_name: string or None Name of downloaded file. If None, will be the last component of the URL url: string URL to fetch name: str text description of this file. force: boolean (default False) If True, overwrite an existing entry for this file unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None action to take in order to unpack this file. If None, infers from file type. """ if url is None: raise Exception("`url` is required") file_name = infer_filename(file_name=file_name, url=url) fetch_dict = { 'fetch_action': 'url', 'file_name': file_name, 'hash_type': hash_type, 'hash_value': hash_value, 'name': name, 'url': url, } if unpack_action: filelist_entry.update({'unpack_action': unpack_action}) if file_name in self.file_dict and not force: raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.") self.file_dict[file_name] = fetch_dict self.fetched_ = False def dataset_opts(self, metadata=None, **kwargs): """Convert raw DataSource files into a Dataset constructor dict Parameters ---------- metadata: dict or None If None, an empty metadata dictionary will be used. **kwargs: additional parameters to be passed to `extract_func` Returns ------- Dictionary containing the following keys: dataset_name: (string) `dataset_name` that was passed to the function metadata: (dict) dict containing the input `metadata` key/value pairs, and (optionally) additional information about this raw dataset data: array-style object Often a `numpy.ndarray` or `pandas.DataFrame` target: (optional) vector-style object for supervised learning problems, the target vector associated with `data` """ if metadata is None: metadata = {} data, target = None, None if self.parse_function is None: logger.warning("No `parse_function` defined. `data` and `target` will be None") else: data, target, metadata = self.parse_function(metadata=metadata, **kwargs) dset_opts = { 'dataset_name': self.name, 'metadata': metadata, 'data': data, 'target': target, } return dset_opts def fetch(self, fetch_path=None, force=False): """Fetch files in the `file_dict` to `raw_data_dir` and check hashes. Parameters ---------- fetch_path: None or string By default, assumes dataset_dir force: Boolean If True, ignore the cache and re-download the fetch each time """ if self.fetched_ and force is False: # validate the downloaded files: for filename, item in self.file_dict.items(): raw_data_file = paths['raw_data_path'] / filename if not raw_data_file.exists(): logger.warning(f"{raw_data_file.name} missing. Invalidating fetch cache") self.fetched_ = False break raw_file_hash = hash_file(raw_data_file, algorithm=item['hash_type']).hexdigest() if raw_file_hash != item['hash_value']: logger.warning(f"{raw_data_file.name} {item['hash_type']} hash invalid ({raw_file_hash} != {item['hash_value']}). Invalidating fetch cache.") self.fetched_ = False break else: logger.debug(f'Data Source {self.name} is already fetched. Skipping') return if fetch_path is None: fetch_path = self.dataset_dir else: fetch_path = pathlib.Path(fetch_path) self.fetched_ = False self.fetched_files_ = [] for key, item in self.file_dict.items(): status, result, hash_value = fetch_file(**item) logger.debug(f"Fetching {key}: status:{status}") if status: # True (cached) or HTTP Code (successful download) item['hash_value'] = hash_value item['file_name'] = result.name self.fetched_files_.append(result) else: if item.get('fetch_action', False) != 'message': logger.error(f"fetch of {key} returned: {result}") break else: self.fetched_ = True self.unpacked_ = False return self.fetched_ def raw_file_list(self, return_hashes=False): """Returns the list of raw files. Parameters ---------- return_hashes: Boolean If True, returns tuples (filename, hash_type, hash_value). If False (default), return filenames only Returns the list of raw files that will be present once data is successfully fetched""" if return_hashes: return [(key, item['hash_type'], item['hash_value']) \ for (key, item) in self.file_dict.items()] else: return [key for key in self.file_dict] def unpack(self, unpack_path=None, force=False): """Unpack fetched files to interim dir""" if not self.fetched_: logger.debug("unpack() called before fetch()") self.fetch() if self.unpacked_ and force is False: logger.debug(f'Data Source {self.name} is already unpacked. Skipping') else: if unpack_path is None: unpack_path = paths['interim_data_path'] / self.name else: unpack_path = pathlib.Path(unpack_path) for filename, item in self.file_dict.items(): unpack(filename, dst_dir=unpack_path, unpack_action=item.get('unpack_action', None)) self.unpacked_ = True self.unpack_path_ = unpack_path return self.unpack_path_ def process(self, cache_path=None, force=False, return_X_y=False, use_docstring=False, **kwargs): """Turns the data source into a fully-processed Dataset object. This generated Dataset object is cached using joblib, so subsequent calls to process with the same file_list and kwargs should be fast. Parameters ---------- cache_path: path Location of joblib cache. force: boolean If False, use a cached object (if available). If True, regenerate object from scratch. return_X_y: boolean if True, returns (data, target) instead of a `Dataset` object. use_docstring: boolean If True, the docstring of `self.parse_function` is used as the Dataset DESCR text. """ if not self.unpacked_: logger.debug("process() called before unpack()") self.unpack() if cache_path is None: cache_path = paths['interim_data_path'] else: cache_path = pathlib.Path(cache_path) # If any of these things change, recreate and cache a new Dataset meta_hash = self.to_hash(**kwargs) dset = None dset_opts = {} if force is False: try: dset = Dataset.load(meta_hash, data_path=cache_path) logger.debug(f"Found cached Dataset for {self.name}: {meta_hash}") except FileNotFoundError: logger.debug(f"No cached Dataset found. Re-creating {self.name}") if dset is None: metadata = self.default_metadata(use_docstring=use_docstring) supplied_metadata = kwargs.pop('metadata', {}) dset_opts = self.dataset_opts(metadata={**metadata, **supplied_metadata}, **kwargs) dset = Dataset(**dset_opts) dset.dump(dump_path=cache_path, file_base=meta_hash) if return_X_y: return dset.data, dset.target return dset def default_metadata(self, use_docstring=False): """Returns default metadata derived from this DataSource This sets the dataset_name, and fills in `license` and `descr` fields if they are present, either on disk, or in the file list Parameters ---------- use_docstring: boolean If True, the docstring of `self.parse_function` is used as the Dataset DESCR text. Returns ------- Dict of metadata key/value pairs """ metadata = {} optmap = { 'DESCR': 'descr', 'LICENSE': 'license', } filemap = { 'license': f'{self.name}.license', 'descr': f'{self.name}.readme' } for key, fetch_dict in self.file_dict.items(): name = fetch_dict.get('name', None) # if metadata is present in the URL list, use it if name in optmap: txtfile = get_dataset_filename(fetch_dict) with open(paths['raw_data_path'] / txtfile, 'r') as fr: metadata[optmap[name]] = fr.read() if use_docstring: func = partial(self.parse_function) fqfunc, invocation = partial_call_signature(func) metadata['descr'] = f'Data processed by: {fqfunc}\n\n>>> ' + \ f'{invocation}\n\n>>> help({func.func.__name__})\n\n' + \ f'{func.func.__doc__}' metadata['dataset_name'] = self.name return metadata def to_hash(self, ignore=None, hash_type='sha1', **kwargs): """Compute a hash for this object. converts this object to a dict, and hashes the result, adding or removing keys as specified. hash_type: {'md5', 'sha1', 'sha256'} Hash algorithm to use ignore: list list of keys to ignore kwargs: key/value pairs to add before hashing """ if ignore is None: ignore = ['dataset_dir'] my_dict = {**self.to_dict(), **kwargs} for key in ignore: my_dict.pop(key, None) return joblib.hash(my_dict, hash_name=hash_type) def __hash__(self): return hash(self.to_hash()) def to_dict(self): """Convert a DataSource to a serializable dictionary""" parse_function_dict = serialize_partial(self.parse_function) obj_dict = { 'url_list': list(self.file_dict.values()), **parse_function_dict, 'name': self.name, 'dataset_dir': str(self.dataset_dir) } return obj_dict @classmethod def from_name(cls, datasource_name, datasource_file='datasources.json', datasource_path=None): """Create a DataSource from a dictionary key name. The `datasource_file` is a json file mapping datasource_name to its dictionary representation. Parameters ---------- datasource_name: str Name of data source. Used as the key in the on-disk key_file key_file_path: Location of key_file (json dict containing data source defintion) if None, use source code module: src/data/{key_file_name} key_file_name: Name of json file containing key/dict map """ datasources, _ = available_datasources(datasource_file=datasource_file, datasource_path=datasource_path, keys_only=False) return cls.from_dict(datasources[datasource_name]) @classmethod def from_dict(cls, obj_dict): """Create a DataSource from a dictionary. name: str dataset name dataset_dir: path pathname to load and save dataset obj_dict: dict Should contain url_list, and parse_function_{name|module|args|kwargs} keys, name, and dataset_dir """ file_list = obj_dict.get('url_list', []) parse_function = deserialize_partial(obj_dict) name = obj_dict['name'] dataset_dir = obj_dict.get('dataset_dir', None) return cls(name=name, parse_function=parse_function, dataset_dir=dataset_dir, file_list=file_list)